import numpy as np
from tqdm import tqdm
import os, sys
import joblib, pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV

# The data folder path is kept in a pickle file in the working directory;
# the training array is then read from inside that folder.
with open('data_directory.pickle', 'rb') as directory_file:
    data_directory = pickle.load(directory_file)
train_dataset_path = f'{data_directory}//train_dataset.npy'
train_dataset = np.load(train_dataset_path)

# The folder that receives the fitted models is stored the same way.
with open('model_directory.pickle', 'rb') as directory_file:
    model_directory = pickle.load(directory_file)

# ---- Candidate hyperparameter values for the random forest ----

# Forest size: 100 evenly spaced values from 100 to 600, truncated to integers
n_estimators = list(np.linspace(100, 600, 100).astype(int))
# Split-quality measure
criterion = ['gini', 'entropy']
# How many features are examined at each split
max_features = ['log2', 'sqrt']
# Depth cap per tree: 20 evenly spaced integers from 2 to 80, plus None
max_depth = [*np.linspace(2, 80, num=20).astype(int), None]
# Smallest sample count at which a node may still be split
min_samples_split = [2, 5, 10]
# Smallest sample count allowed in a leaf
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# Search space keyed by RandomForestClassifier parameter name.
# binary_classifier() hands this dict to BayesSearchCV as `search_spaces`.
random_grid = dict(
    n_estimators=n_estimators,
    criterion=criterion,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

def binary_classifier(X_train, y_train, *, n_iter=32, n_points=32, cv=5,
                      n_jobs=12, scoring='f1'):
    """Tune and fit a scaled random-forest classifier on one training set.

    The features are standardised with ``StandardScaler`` and then passed to
    a ``BayesSearchCV`` that searches the module-level ``random_grid`` for
    ``RandomForestClassifier`` hyperparameters. The search refits the best
    configuration on the full training set (``refit=True``).

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        n_iter: Number of hyperparameter settings evaluated by the search.
        n_points: Number of settings the search proposes per batch. With the
            defaults it equals ``n_iter``, so all candidates come from a
            single batch.
        cv: Cross-validation setting forwarded to ``BayesSearchCV``.
        n_jobs: Number of parallel jobs forwarded to ``BayesSearchCV``.
        scoring: Scoring name forwarded to ``BayesSearchCV``.

    Returns:
        tuple: ``(pipeline, best_score)`` where ``pipeline`` is the fitted
        ``Pipeline`` (steps ``'scaler'`` and ``'rf'``) and ``best_score`` is
        the search's ``best_score_``.
    """
    search = BayesSearchCV(estimator=RandomForestClassifier(),
                           search_spaces=random_grid,
                           n_iter=n_iter, scoring=scoring, n_points=n_points,
                           refit=True, cv=cv, verbose=0, n_jobs=n_jobs)
    # NOTE: the scaler sits outside the search, so it is fitted once on all of
    # X_train before the cross-validation splits are made.
    pipeline = Pipeline([('scaler', StandardScaler()), ('rf', search)])
    pipeline.fit(X_train, y_train)
    best_score = pipeline['rf'].best_score_
    return pipeline, best_score

def training_data(pair):
    """Assemble a two-class training set from a pair of dataset entries.

    Looks up ``train_dataset[pair[0]]`` and ``train_dataset[pair[1]]`` in the
    module-level ``train_dataset``, drops every row that contains a NaN, and
    stacks the remaining rows.

    Args:
        pair: Indexable whose first two items select entries of
            ``train_dataset``.

    Returns:
        tuple: ``(X_train, y_train)``. ``X_train`` holds the kept rows of the
        first entry followed by those of the second; ``y_train`` labels them
        0 and 1 respectively.
    """
    cleaned = []
    for key in (pair[0], pair[1]):
        samples = train_dataset[key]
        row_has_nan = np.isnan(samples).any(axis=1)
        cleaned.append(samples[~row_has_nan])
    negatives, positives = cleaned

    X_train = np.concatenate((negatives, positives), axis=0)
    y_train = np.array([0] * negatives.shape[0] + [1] * positives.shape[0])
    return X_train, y_train

def build_library(user_pairs):
    """Train and store one classifier for every pair in ``user_pairs``.

    For each pair the training set comes from ``training_data`` and the model
    from ``binary_classifier``. Each fitted model is written with joblib to
    ``<model_directory>//<pair[0]>_<pair[1]>.joblib``. Reads the module-level
    names ``model_directory`` and ``pos``; the latter places the progress bar
    on row ``pos + 1``.

    Args:
        user_pairs: Iterable of pairs, each indexable at positions 0 and 1.

    Returns:
        list: The score returned by ``binary_classifier`` for each pair, in
        iteration order.
    """
    progress = tqdm(user_pairs, unit='model', ncols=100, position=pos+1, ascii=" |", leave=False)
    scores = []
    for pair in progress:
        features, labels = training_data(pair)
        fitted_model, cv_score = binary_classifier(features, labels)
        scores.append(cv_score)
        # Persist the model under a name built from the two pair members
        joblib.dump(fitted_model, f'{model_directory}//{pair[0]}_{pair[1]}.joblib')
    return scores

# Command line: <script> <user-pairs .npy file> <partition index>
# `pos` is also read by build_library() to place its progress bar.
fname, pos = sys.argv[1], int(sys.argv[2])

# Build a model for every pair listed in the partition file
user_pairs = np.load(fname)
model_scores = build_library(user_pairs)

# Store this partition's scores under its own index
score_file = f'model_building_partition_files//score_{pos}.npy'
np.save(score_file, model_scores)